Working with XML


In [1]:
# xml1.py
# parse a small XML document embedded as a triple-quoted string

import xml.etree.ElementTree as ET

data = '''
<person>
  <name>Chuck</name>
  <phone type="intl">
     +1 734 303 4456
   </phone>
   <email hide="yes"/>
</person>'''

# fromstring() returns the root element (<person>) directly
tree = ET.fromstring(data)

# pull out the pieces first, then report them
name_text = tree.find('name').text          # text content of <name>
hide_attr = tree.find('email').get('hide')  # attribute lookup on <email/>

print('Name:', name_text)
print('Attr:', hide_attr)


Name: Chuck
Attr: yes

In [2]:
# xml2.py
# a more involved document: nested tags with repeated <user> records
# (renamed the source string from `input` to `data` — `input` shadowed
# the built-in input() function)

import xml.etree.ElementTree as ET

data = '''
<stuff>
    <users>
        <user x="2">
            <id>001</id>
            <name>Chuck</name>
        </user>
        <user x="7">
            <id>009</id>
            <name>Brent</name>
            </user>
        </users>
</stuff>'''

stuff = ET.fromstring(data)
# findall() with a path expression matches every <user> nested under <users>
lst = stuff.findall('users/user')
print('User count:', len(lst))

for item in lst:
    print('Name', item.find('name').text)
    print('Id', item.find('id').text)
    print('Attribute', item.get("x"))


User count: 2
Name Chuck
Id 001
Attribute 2
Name Brent
Id 009
Attribute 7

Doing assignment 1

This is an exercise in XML


In [3]:
# http://www.saltycrane.com/blog/2011/07/example-parsing-xml-lxml-objectify/
# Sum the <count> value of every <comment> in the course's XML feed.
# (dropped `import pandas as pd` — pandas was never used in this cell)

from lxml import etree, objectify

URL = 'http://python-data.dr-chuck.net/comments_42.xml'            # sample feed (sum = 2553)
URL_real = 'http://python-data.dr-chuck.net/comments_371511.xml'   # personalised feed

# etree.parse() accepts a URL directly, so no separate download step is needed
tree = etree.parse(URL)
string = etree.tostring(tree.getroot())
root = objectify.fromstring(string)

# each <comment> element holds a numeric <count>; collect and total them
values = [int(leaf.count.text) for leaf in root.comments.comment]

print(sum(values))


2553

In [4]:
# doing the same thing but using urllib - which is silly since etree.parse is all that is needed

import urllib.request
import pandas as pd
from lxml import etree, objectify

URL = 'http://python-data.dr-chuck.net/comments_42.xml'
URL_real = 'http://python-data.dr-chuck.net/comments_371511.xml'

xml = urllib.request.urlopen(URL_real)
tree = etree.parse(xml)

string =  etree.tostring(tree.getroot())
root = objectify.fromstring(string)

values = []
for e in root.comments.comment:
    #print(e.count.text)
    values.append(int(e.count.text))

print(sum(values))


2393

Working with JSON


In [5]:
# working with what would be called a dicitionary within Python - but in JSON is an object

import json

data = '''
{
  "name" : "Chuck",
  "phone" : {
    "type" : "intl",
    "number" : "+1 734 303 4456"
   },
   "email" : {
     "hide" : "yes"
   }
}'''

info = json.loads(data)
print('Name:',info["name"])
print('Hide:',info["email"]["hide"])


Name: Chuck
Hide: yes

In [6]:
# and here we use what would be called a list (of dictionaries) in Python - but in JSON is an array

import json

input = '''
[
  { "id" : "001",
    "x" : "2",
    "name" : "Chuck"
  } ,
  { "id" : "009",
    "x" : "7",
    "name" : "Chuck"
  } 
]'''

info = json.loads(input)
print('User count:', len(info))

for item in info:
    print('Name', item['name'])
    print('Id', item['id'])
    print('Attribute', item['x'])


User count: 2
Name Chuck
Id 001
Attribute 2
Name Chuck
Id 009
Attribute 7

The Service Oriented Approach

accessing APIs (Application Programming Interfaces) in Python

using the google geocode API

In [7]:
# Google geocode API example from the course material, kept as an unexecuted
# string literal: running it would loop issuing network requests to the
# Google Maps geocode endpoint, which this notebook deliberately avoids.
'''import urllib.request, urllib.parse, urllib.error
import json

serviceurl = 'http://maps.googleapis.com/maps/api/geocode/json?'

while True:
    #address = input('Enter location: ')
    address = 'Ann Arbor, MI'
    if len(address) < 1: break

    url = serviceurl + urllib.parse.urlencode({'address': address})

    print('Retrieving', url)
    uh = urllib.request.urlopen(url)
    data = uh.read().decode()
    print('Retrieved', len(data), 'characters')

    try:
        js = json.loads(data)
    except:
        js = None

    if not js or 'status' not in js or js['status'] != 'OK':
        print('==== Failure To Retrieve ====')
        print(data)
        continue

    lat = js["results"][0]["geometry"]["location"]["lat"]
    lng = js["results"][0]["geometry"]["location"]["lng"]
    print('lat', lat, 'lng', lng)
    location = js['results'][0]['formatted_address']
    print(location)
'''


Out[7]:
'import urllib.request, urllib.parse, urllib.error\nimport json\n\nserviceurl = \'http://maps.googleapis.com/maps/api/geocode/json?\'\n\nwhile True:\n    #address = input(\'Enter location: \')\n    address = \'Ann Arbor, MI\'\n    if len(address) < 1: break\n\n    url = serviceurl + urllib.parse.urlencode({\'address\': address})\n\n    print(\'Retrieving\', url)\n    uh = urllib.request.urlopen(url)\n    data = uh.read().decode()\n    print(\'Retrieved\', len(data), \'characters\')\n\n    try:\n        js = json.loads(data)\n    except:\n        js = None\n\n    if not js or \'status\' not in js or js[\'status\'] != \'OK\':\n        print(\'==== Failure To Retrieve ====\')\n        print(data)\n        continue\n\n    lat = js["results"][0]["geometry"]["location"]["lat"]\n    lng = js["results"][0]["geometry"]["location"]["lng"]\n    print(\'lat\', lat, \'lng\', lng)\n    location = js[\'results\'][0][\'formatted_address\']\n    print(location)\n'
using the Twitter API

In [8]:
# Twitter API example from the course material, kept as an unexecuted string
# literal: it depends on a local `twurl` helper module (OAuth credentials)
# and interactive input(), neither of which is available in this notebook.
'''import urllib.request, urllib.parse, urllib.error
import twurl
import json

TWITTER_URL = 'https://api.twitter.com/1.1/friends/list.json'

while True:
    print('')
    acct = input('Enter Twitter Account:')
    if (len(acct) < 1): break
    url = twurl.augment(TWITTER_URL,
                        {'screen_name': acct, 'count': '5'})
    print('Retrieving', url)
    connection = urllib.request.urlopen(url)
    data = connection.read().decode()
    headers = dict(connection.getheaders())
    print('Remaining', headers['x-rate-limit-remaining'])
    js = json.loads(data)
    print(json.dumps(js, indent=4))

    for u in js['users']:
        print(u['screen_name'])
        s = u['status']['text']
        print('  ', s[:50])
'''


Out[8]:
"import urllib.request, urllib.parse, urllib.error\nimport twurl\nimport json\n\nTWITTER_URL = 'https://api.twitter.com/1.1/friends/list.json'\n\nwhile True:\n    print('')\n    acct = input('Enter Twitter Account:')\n    if (len(acct) < 1): break\n    url = twurl.augment(TWITTER_URL,\n                        {'screen_name': acct, 'count': '5'})\n    print('Retrieving', url)\n    connection = urllib.request.urlopen(url)\n    data = connection.read().decode()\n    headers = dict(connection.getheaders())\n    print('Remaining', headers['x-rate-limit-remaining'])\n    js = json.loads(data)\n    print(json.dumps(js, indent=4))\n\n    for u in js['users']:\n        print(u['screen_name'])\n        s = u['status']['text']\n        print('  ', s[:50])\n"

Doing Assignment 2

finding information in a JSON page


In [9]:
# suggested code for inspiration

import json
from urllib.request import urlopen as uReq

input = '''
[
  { "id" : "001",
    "x" : "2",
    "name" : "Chuck"
  } ,
  { "id" : "009",
    "x" : "7",
    "name" : "Chuck"
  } 
]'''

info = json.loads(input)
print('User count:', len(info))

for item in info:
    print('Name', item['name'])
    print('Id', item['id'])
    print('Attribute', item['x'])


User count: 2
Name Chuck
Id 001
Attribute 2
Name Chuck
Id 009
Attribute 7

In [10]:
# solving the assignment
import json
from urllib.request import urlopen as uReq

sample_data = 'http://python-data.dr-chuck.net/comments_42.json' #(Sum=2553)
actual_data = 'http://python-data.dr-chuck.net/comments_371515.json' #(Sum ends with 77)

# opening up connection, grabbing the page
uClient = uReq(actual_data)
page_json = uClient.read()
uClient.close()

# parse the data
info = json.loads(page_json)

# check if we have all the data
#print('User count:', len(info['comments']))

# loop over the data and collect the values
values = []
for item in info['comments']:
    #print(item['count'])
    values.append(item['count'])
    
print(sum(values))


2677

Doing assignment 3

calling a JSON API


In [11]:
# suggested inspirational code

'''
import urllib
import json

# serviceurl = 'http://maps.googleapis.com/maps/api/geocode/json?'
serviceurl = 'http://python-data.dr-chuck.net/geojson?'

while True:
    address = raw_input('Enter location: ')
    if len(address) < 1 : break

    url = serviceurl + urllib.urlencode({'sensor':'false', 'address': address})
    print 'Retrieving', url
    uh = urllib.urlopen(url)
    data = uh.read()
    print 'Retrieved',len(data),'characters'

    try: js = json.loads(str(data))
    except: js = None
    if 'status' not in js or js['status'] != 'OK':
        print '==== Failure To Retrieve ===='
        print data
        continue

    print json.dumps(js, indent=4)

    lat = js["results"][0]["geometry"]["location"]["lat"]
    lng = js["results"][0]["geometry"]["location"]["lng"]
    print 'lat',lat,'lng',lng
    location = js['results'][0]['formatted_address']
    print location
'''


Out[11]:
'\nimport urllib\nimport json\n\n# serviceurl = \'http://maps.googleapis.com/maps/api/geocode/json?\'\nserviceurl = \'http://python-data.dr-chuck.net/geojson?\'\n\nwhile True:\n    address = raw_input(\'Enter location: \')\n    if len(address) < 1 : break\n\n    url = serviceurl + urllib.urlencode({\'sensor\':\'false\', \'address\': address})\n    print \'Retrieving\', url\n    uh = urllib.urlopen(url)\n    data = uh.read()\n    print \'Retrieved\',len(data),\'characters\'\n\n    try: js = json.loads(str(data))\n    except: js = None\n    if \'status\' not in js or js[\'status\'] != \'OK\':\n        print \'==== Failure To Retrieve ====\'\n        print data\n        continue\n\n    print json.dumps(js, indent=4)\n\n    lat = js["results"][0]["geometry"]["location"]["lat"]\n    lng = js["results"][0]["geometry"]["location"]["lng"]\n    print \'lat\',lat,\'lng\',lng\n    location = js[\'results\'][0][\'formatted_address\']\n    print location\n'

In [27]:
# solving the assignment
import json
from urllib.request import urlopen as uReq
from urllib.parse import urlencode as uEncode

# The program will prompt for a location
#address = input('Enter location: ')
address = 'kansas state university'

# contact a web service and retrieve JSON for the web service and parse that data
endpoint = 'http://python-data.dr-chuck.net/geojson?' # we use this API endpoint with a static subset of the Google Data:
url = endpoint + uEncode({'sensor':'false', 'address': address})
print('Retrieving', url)

uClient = uReq(url)
data = uClient.read()
uClient.close()
print('Retrieved',len(data),'characters')

# and retrieve the first place_id from the JSON
info = json.loads(data)
place_id = info["results"][0]["place_id"]
print(place_id)


Retrieving http://python-data.dr-chuck.net/geojson?sensor=false&address=kansas+state+university
Retrieved 1972 characters
ChIJ39MaXcPguocRlRSBbdscXis

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]: